Build your own recommendation system for products on an e-commerce website like Amazon.com.
Online e-commerce websites like Amazon and Flipkart use different recommendation models to provide different suggestions to different users.
Amazon currently uses item-to-item collaborative filtering, which scales to massive data sets and produces high-quality recommendations in real time. This type of filtering matches each of the user's purchased and rated items to similar items, then combines those similar items into a recommendation list for the user.
In this project we are going to build a recommendation model for the electronics products of Amazon.
The dataset here is taken from the below website.
Source - Amazon Reviews data (http://jmcauley.ucsd.edu/data/amazon/) The repository has several datasets. For this case study, we are using the Electronics dataset.
Dataset columns - the first three columns are userId, productId, and ratings, and the fourth column is timestamp. You can discard the timestamp column because it is not needed in this case.
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, evaluate
from surprise import SVD, KNNBaseline, KNNBasic, NMF, NormalPredictor, BaselineOnly, KNNWithMeans
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from collections import defaultdict
from surprise import accuracy
import matplotlib.pyplot as plt
%matplotlib inline
# Surprise Reader created with its default settings.
reader = Reader()

# Column names are supplied explicitly; because `names=` is given without a
# `header=` argument, pandas reads the first line of the file as data.
column_names = ['userId', 'productId', 'ratings', 'timestamp']
dfData = pd.read_csv('ratings_Electronics.csv', names=column_names)
dfData.head()
dfData.shape

# Work on a copy of the frame without the timestamp column.
dfWithoutTimeStamp = dfData.drop(columns=['timestamp'])
dfWithoutTimeStamp.head()
# Plotly (offline mode) is used for the interactive charts drawn with iplot().
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Set up Plotly's notebook integration once, before the first iplot() call.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the Plotly documentation.
init_notebook_mode(connected=True)
# --- Bar chart: how many ratings each rating value received ---

# Total number of rating rows; used for the percentage labels and the title.
total_ratings = dfWithoutTimeStamp.shape[0]

# Count of rows per rating value, ordered by rating value, highest first.
data = dfWithoutTimeStamp['ratings'].value_counts().sort_index(ascending=False)

# Text shown on each bar: that rating value's share of all ratings, in percent.
percent_labels = [f'{share:.1f} %' for share in (data.values / total_ratings * 100)]

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=percent_labels,
    textposition='auto',
    textfont={'color': '#000000'},
)

# Chart title and axis titles.
layout = {
    'title': f'Distribution Of {total_ratings} Ratings',
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Assemble the figure and render it inline.
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# --- Histogram: number of ratings received per product ---

# Rating count for every productId; counts above 50 are capped at 50 so they
# share the histogram's 0-50 range with everything else.
ratings_per_product = dfWithoutTimeStamp.groupby('productId')['ratings'].count()
data = ratings_per_product.clip(upper=50)

# Bins of width 2 covering 0 to 50.
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins={'start': 0, 'end': 50, 'size': 2},
)

layout = go.Layout(
    title='Distribution Of Number of Ratings Per Product',
    xaxis={'title': 'Number of Ratings Per Product'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# Assemble the figure and render it inline.
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# Ten products with the most ratings: count ratings per productId, turn the
# result back into a two-column frame, sort by count (largest first), and show
# the first ten rows. The final line stays a bare expression so the notebook
# displays it.
product_rating_counts = dfWithoutTimeStamp.groupby('productId')['ratings'].count().reset_index()
product_rating_counts.sort_values('ratings', ascending=False).head(10)
# --- Histogram: number of ratings given per user ---

# Rating count for every userId; counts above 50 are capped at 50 so they
# share the histogram's 0-50 range with everything else.
ratings_per_user = dfWithoutTimeStamp.groupby('userId')['ratings'].count()
data = ratings_per_user.clip(upper=50)

# Bins of width 2 covering 0 to 50.
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins={'start': 0, 'end': 50, 'size': 2},
)

layout = go.Layout(
    title='Distribution Of Number of Ratings Per User',
    xaxis={'title': 'Ratings Per User'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# Assemble the figure and render it inline.
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)